import json
import math
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
import statsmodels.api as sm
import matplotlib.pyplot as plt
from collections import Counter

#=======================================================================#
#
# Table A3: Did Humans Retweet Bots?
#
#=======================================================================#

# Load the retweet network (tab-separated, first row is the header). The code
# below uses the `tweeter` and `retweeter` columns and looks up the labels
# 'bot' and 'human' in both of them.
df = pd.read_csv('data/retweet_network.csv', sep='\t', header=0)

# Contingency table of tweeter (rows) by retweeter (columns), and the
# chi-square test of independence computed on it.
ct = pd.crosstab(df.tweeter, df.retweeter)
c, p, dof, expected = chi2_contingency(ct)

# t1 holds the raw counts; t2 holds the same table normalised within each
# tweeter row, so every row of t2 sums to 1.
t1 = ct
t2 = pd.crosstab(df.tweeter, df.retweeter, normalize='index')

# Row totals are looked up by label rather than by position, so the output
# does not depend on how pandas happens to order the index.
row_totals = t1.sum(axis=1)
account_types = ['bot', 'human']

# One "count (row %)" string per cell, grouped by retweeter type.
cells = {
    retweeter: ['%d (%0.1f%%)' % (t1.loc[tweeter, retweeter],
                                 t2.loc[tweeter, retweeter] * 100)
                for tweeter in account_types]
    for retweeter in account_types
}
totals = ['%d (100%%)' % row_totals.loc[tweeter] for tweeter in account_types]

results = pd.DataFrame({'Tweeter': ['Bot', 'Human'],
                        'Retweeter: Bot': cells['bot'],
                        'Retweeter: Human': cells['human'],
                        'Total': totals})

with open('tables/tableA3.txt', 'w') as fout:
    print(results.to_string(index=False), file=fout)
    # Raw string: the backslash in \chi is literal LaTeX, not a Python escape.
    print(r'$\chi^2$ = %0.1f; $p = %0.3f$' % (c, p), file=fout)

#=======================================================================#
#
# Figure A1: Top URL Links Shared by Social Bots and Retweeted by Humans
#
#=======================================================================#

# Keep only the rows that carry at least one URL.
df = df[pd.notnull(df.urls)]

# Flatten the whitespace-separated URL strings of bot-authored rows into one
# list and count how often each URL occurs.
burls = [url for cell in df[df.tweeter == 'bot'].urls for url in cell.split()]
urlbot = Counter(burls)
most_shared = urlbot.most_common(50)

# Saving most commonly shared links
influence_rows = []
for url, shares in most_shared:
    # Literal substring match: URLs are full of regex metacharacters
    # ('.', '?', '+', ...), so they must not be interpreted as patterns.
    # NOTE(review): this is still a substring test, so a URL that is a prefix
    # of another URL also matches the longer one -- confirm this is intended.
    has_url = df.urls.str.contains(url, regex=False)
    matched = df[has_url]
    url_ct = pd.crosstab(matched.tweeter, matched.retweeter)
    try:
        n = (url_ct.loc['bot', 'human'] + url_ct.loc['bot', 'bot']
             + url_ct.loc['human', 'human'] + url_ct.loc['human', 'bot'])
        cc = url_ct.loc['bot', 'human']
    except KeyError:
        # The crosstab only contains labels that actually occur; URLs for
        # which 'bot' or 'human' is missing on either axis are skipped.
        continue

    # Share of matching rows with tweeter == 'bot' and retweeter == 'human',
    # with a normal-approximation 95% interval expressed in percent.
    y0 = cc / n
    se = math.sqrt(y0 * (1 - y0) / n)
    lower = round(100 * (y0 - 1.96 * se), 4)
    upper = round(100 * (y0 + 1.96 * se), 4)
    influence_rows.append((url, shares, cc, y0, se, lower, upper))

# Written without a header row. to_csv quotes fields where needed, so a URL
# containing a comma cannot shift the remaining columns.
pd.DataFrame(
    influence_rows,
    columns=['link', 'shares', 'human', 'prop', 'se', 'lb', 'ub'],
).to_csv('data/url_influence.csv', header=False, index=False)

# Creating Figure

# Mapping applied to the `link` column to obtain the bar labels.
with open('data/url_map.json') as fin:
    url_dict = json.load(fin)

# The influence file has no header row, so column names are supplied here.
df = pd.read_csv('data/url_influence.csv', header=None,
                 names=['link', 'shares', 'human', 'prop', 'se', 'lb', 'ub'])

# Proportion -> percent, and attach the label for each link.
df['y'] = df.prop.astype(float) * 100
df['desc'] = df.link.map(url_dict)

# Drop links without a label. The explicit copy makes the column assignments
# below operate on an independent frame instead of a slice of the original.
df = df[pd.notnull(df.desc)].copy()
df['human'] = df.human.astype(int)
df['lb'] = df.lb.astype(float)
df['ub'] = df.ub.astype(float)

# Error-bar length is the distance from the bar value down to the lower bound.
# NOTE(review): assumes `lb` is already on the 0-100 scale like `y` -- confirm
# against whatever writes data/url_influence.csv.
df['err'] = df['y'] - df['lb']

# Ascending sort followed by tail(30) keeps the 30 largest values of `y`.
df = df.sort_values(by='y', ascending=True).reset_index(drop=True)
df = df.tail(30)

plt.figure(figsize=(21, 21))

plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=20)

plt.barh(df['desc'], df['y'], xerr=df['err'], edgecolor='black', linewidth=2,
         facecolor=(102/255, 0, 51/255, 0.8), alpha=0.75)

plt.xlabel("Percentage (total number) of URL links posted by bots and retweeted by humans", fontsize=24)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.xlim((0, 100))
plt.xticks(range(0, 100, 5))

# Annotate every bar with its `human` count, offset to the right of the bar end.
for label, pct, n_human in zip(df['desc'], df['y'], df['human']):
    plt.annotate(n_human, xy=(pct, label), xytext=(85, -10), fontsize=18,
                 textcoords='offset points', ha='right', va='bottom',
                 bbox=dict(boxstyle='round,pad=0.1', fc='black', alpha=0.1))

plt.savefig('figures/figureA1.jpeg', dpi=200, bbox_inches='tight')

